/*
 Create_regression_var_from_CART_imputes -- 
 creates the variables needed for the FHS regressions (at least table 6)
 using the CART-completed datasets.
 This follows the variable construction in FHS's fix_outlier_ppsr50.sas program

 11/06/13;
 01/02/14: Modified to handle (multi-product) boxes CART imputations;
 05/09/14: Modified to duplicate what FHS did, but using CART-completed data. 

 04/13/15: Modified to use CART imputations for "all" variables. (Revision 4 of REStat paper.)
 04/21/15: Modified to use FHS's pimatx instead of pimat as the materials deflator (used for both lm and lm2).
           Modified mat and lm measure for concrete in 1992 to be the same as in other years (unlike for other industries).

**/

/* OPTIONS OBS=5000 NOREPLACE  ; */

%include "ASMimplibs.sas";




%macro chk(d,data,imptype);

*      d = product name;


       data all (drop = fdeath);
        set rev4.&d._&imptype;
	mer=1;
       run;

       /* MATCH ORIGINAL FHS "FINAL" DATASET WITH CART-COMPLETED DATA
        AND CONSTRUCT SOME VARIABLES USING THE MERGED DATASET  */

       proc sort data=all; by year ppn; run;

       /* For variables for which we did not make CART imputations (e.g., deflators and cost shares), just use FHS's
          original variable--there is no point in reconstructing what they already did for these
          variables.  For other variables, such as the log of real energy (le), rescaled physical output (phyq), the log of rescaled
          physical output (lphyq), nominal revenue (nq2), real revenue (q2), log real revenue (lq2), the price index (pbard), traditional
          real revenue (q3), the log of traditional real revenue (lq3), log prices and demand shocks, we have to reconstruct them. */
 
       data &d._orig (keep = ppn year fdeath dinv cw lkseq lksst piship aiake aiaks aial aiam aiae pien pimatx ppsr1); 
        set fhspen50.all&d.f; 
       run;
       
       proc sort data=&d._orig; by year ppn; run;

       data all in_FHS_not_in_CART;
        merge all (in=inall) &d._orig (in=inorig);
        by year ppn;
        if inall and inorig then output all;
        else if inorig then output in_FHS_not_in_CART;
       run;

       proc print data= in_FHS_not_in_CART;
         var ppn year tvs te sw kseq ksst ppsr1;
       run;

       /*  Get main variable edit/impute flags and merge them. */

       data &d.f_with_flags (keep = ppn year ee cf cr cp_orig nee_f_imp ncf_f_imp ncm_f_imp ncp_f_imp ntvs_f_imp);
        set dbflags.phy&d.f_with_flags;
	if aphyflag>0;
	if ppsr1>0.50;
        cp_orig = cp;
       run;

       proc sort data=&d.f_with_flags; by year ppn; run;
       proc sort data=all; by year ppn;

       data all;
        merge all (in=inall) &d.f_with_flags (in=inflags);
        by year ppn;
        if inall and inflags then output all;
       run;

       /*  Get PQS and PV edit/impute flags and merge them with main dataset. */

        data &d.f (keep = ppn year pqs_imp_mean pv_imp_mean pv_orig);
         set &d.f;
          pv_orig = pv;
        run;

        proc sort data=&d.f; by ppn year; run;        
        proc sort data=all; by ppn year; run;
       
       data all;
         merge all (in=inall) &d.f (in=inpqsf);
         by ppn year; 
         if inall and inpqsf;
       run;

       data all; 
        set all;
        phyq = pqs/ppsr1;   * scaling up physical product quantity shipped by specialization ratio;
        if pqs > 0 then lqphy = log(phyq);  
        else lqphy = .;

        if pv_imp_mean>0 then pv = pvtvsratio*tvs;
        else pv = pv_orig;  * This keeps the original PV when PV was not imputed (in case TVS was imputed);
        if pqs>0 then price = pv/pqs;
        label price ="Constructed Price=PV/PQS";
	lprice=log(price);
        nq2 = price*phyq;
        nq = tvs+dinv;
        q = nq/piship;  

        ww = wwswratio*sw;
        if ww > 0 then th = ph*(sw/ww);
        if th > 0 then lth = log(th); 
        else lth = .;

        if nee_f_imp=1 or ncf_f_imp=1 then energy = energycmratio*plant_cm;
        else energy = sum(ee,cf);  * This keeps the original energy if both EE and CF were non-imputed;
        real_energy = energy/pien;
        if real_energy > 0 
        then le=log(real_energy);
        else le = .;               * log of energy inputs;


         if ncp_f_imp = 1 then cp = cpcmratio*plant_cm;
         else cp = cp_orig;
         mat2 = sum(cp,cw);
         mat2_real = mat2/pimatx;
         if mat2_real > 0 then lm2 = log(mat2_real);
         else lm2 = .;
                
         if year>1991 then do;
             **if "&d" = "conc" then cm_alt = sum(cp,cw,cr);  
             **else cm_alt = plant_cm - energy;
             cm_alt = plant_cm - energy;
         end;
         else do;  * prior to 1992, the cm variable did not include energy;
             cm_alt = sum(cp,cw,cr);
         end;
         mat = cm_alt/pimatx;
         if mat>0 then lm = log(mat);
         else lm = .;
 
       run;


	* 0) CREATE MEASURES OF PRODUCTIVITY;

        %macro make;
 
	     * starting by creating pbard;
	     * weighted geometric mean of price;
	     * weights are nq2;
	     data all;
	      set all;
	     run;

             proc sort data=all; by _impute_ year; run;

	     proc summary data=all nway;
	      class _impute_ year;
	      var lprice;		/* lprice=log(price); */
	      weight nq2;		/* nq2=pqs*price; */
	      output out=avgprice mean=lpbar;
	     run;	      

	     data avgprice;
	      set avgprice;
	      pbar=exp(lpbar);
	      mer=1;
	      run;

	      data p87 (keep=norm mer _impute_);
	       set avgprice;
	       if year=1987;
	       norm=pbar;
	       mer=1;
	       run;
	       
	      proc sort data=avgprice; by _impute_ mer; run;
	      proc sort data=p87;      by _impute_ mer; run;
  
	      data avgprice (keep=year pbard _impute_ );
	       merge avgprice p87;
	       by _impute_ mer ;
	       pbard=pbar/norm;
	      run;

	      proc sort data=all;      by _impute_ year;run;
	      proc sort data=avgprice; by _impute_ year;run;

	     data all;
	      merge all avgprice;
	      by _impute_ year;
	      q2=nq2/pbard;    /* nq2=price*phyq; */
	      q3=nq/pbard;     /* nq=tvs+dinv; */
              if q > 0 then lq = log(q);
	      if q2 > 0 then lq2 = log(q2);
	      if q3 > 0 then lq3 = log(q3);
	      ltfp  =   lq    -aiake *lkseq -aiaks *lksst -aial *lth -aiam *lm -aiae *le ;
	      ltfp2 =   lq2   -aiake *lkseq -aiaks *lksst -aial *lth -aiam *lm2 -aiae *le ;
	      ltfp3 =   lq3   -aiake *lkseq -aiaks *lksst -aial *lth -aiam *lm -aiae *le ;
              ltfpphy = lqphy -aiake *lkseq -aiaks *lksst -aial *lth -aiam *lm2 -aiae *le ;
	      if price>0 and pbard>0 then lprice2=log(price/pbard); * relative price;
	      counter=1;
	     run;       
      
         %mend;

	 %make;

	     title "regression vars for &d before corrections";run;
	     proc univariate data=all;var lprice lprice2 ltfp ltfp2 ltfpphy;run;

	     title "counter before corrections for &d";run;
	     proc means data=all;var counter;run;


	* 2) MISSING VALUE DELETIONS;

        /**** WE WANT TO USE THE SAME SAMPLE THAT FHS USED, SO DON'T DELETE ANY OBSERVATIONS. **/

             proc means data=all N NMISS noprint;
              var lprice lprice2 price pbard ltfp ltfp2 ltfpphy lq lq2 lqphy lm le ; 
              by _impute_ year;
              output out=missingcounts NMISS= ;
             run;

             proc contents data=missingcounts; run;

             title "Counts of missing values for key variables, &d";
             proc print data=missingcounts;
               var _impute_ year lprice lprice2 price pbard ltfp ltfp2 ltfpphy lq lq2 lqphy lm le;
             run;

        /**** WE WANT TO USE THE SAME SAMPLE THAT FHS USED, SO DON'T DELETE ANY OBSERVATIONS. **/
             
	/**     data all;
	      set all;
              if (lprice=. or lprice2=. or ltfp=. or ltfp2=. or ltfpphy=.) 
		 then delete;
	     run;
        ***/

	     proc sort data=all;by _impute_ year; run;

	     proc means data=all noprint;
               var counter;
               by _impute_ year;
               output out=counts_by_imp_year sum(counter)=count;
             run;

	     title "counter after missing value correction for &d";run;
             proc print data=	counts_by_imp_year;
              var _impute_ year	count;
             run;

	* 2) PRICE OUTLIER CORRECTIONS;
	     proc sort data=all;by _impute_ year; run;

	     proc univariate data=all noprint;
	      by _impute_ year;var price;
	      output out=pmed median=pmed;run;

             /* Changed this to be flagged 
                instead of dropping obs. 
                This way we can have the same sample
                size in each implicate. */

	     data all;
	      merge all pmed;
              by _impute_ year;
              /* if .1*pmed<=price<=10*pmed; */      
              if  price <.1*pmed or price>10*pmed then priceoutlier=1;
              else priceoutlier=0;  
	      run;

              data priceoutliers;
               set all;
               if priceoutlier=1;
              run;

	      proc means data=priceoutliers noprint;
               var counter;
               by _impute_ year;
              output out=priceoutliercounts sum(counter)=count;
              run;

	      title "price outlier counts by implicate and year for &d";run;

	      proc print data=priceoutliercounts ;
               var _impute_ year count;
              run;


       * 3) INPUT OUTLIER CORRECTIONS;

/***********  These corrections were made before the CART imputations, and we want to 
              use the same sample FHS used.  So we don't re-run this code. 


*********/
/*** Don't drop the outliers--outliers were already deleted from the 
     original data, and we want to keep the same sample size for every implicate
 
	       data all;
	        set all;
		if (iamrule < mattvs and mattvs <1) ;
		if ialrule < swtvs <1 ;
	    run;

*******/

       * 4) TRIMMING LTFPPHY TAILS;	 
	      proc sort data=all; by _impute_ year;   
	      proc univariate data=all noprint;
               by _impute_ year;
	       var ltfpphy;
	       output out=trim pctlpre =ltfpp pctlpts = 1 99;
	      run;

            /***** 
                DON'T TRIM THE LTFPPHY TAILS--THIS WAS DONE IN THE ORIGINAL
                fix_outlier_ppsr50.sas program, presumably to deal with measurement error.
                The CART draws imputations from the existing observations, which have already
                been trimmed.
            ****/

	     data all; 
	       merge all trim;
               by _impute_ year;
	       ** if ltfpp1<=ltfpphy<=ltfpp99 then ltfpphy=ltfpphy;
               ** else delete;
	       if ltfpp1<=ltfpphy<=ltfpp99 then ltfpphy_outlier=0;
               else ltfpphy_outlier=1;
	     run;

	     /**proc univariate data=all; 
	       var _impute_ ltfpphy;
               by year;
	     run;

             title "ltfpphy outliers";run;
	     proc freq data=all;
               tables ltfpphy_outlier;
               by year;
             run;
             **/
          

       * 5) NOW HAVE TO REDO PRICE2;	    
	     data all;
	       set all (drop=pbard q2 lq2 ltfp2 lprice2); 
	     run;

	     %make;



       * 6) AND THE FINAL DATA IS;
	     title "regression vars AFTER corrections for &d, imputation 1";run;
	     proc univariate data=all; var lprice lprice2 ltfp ltfp2 ltfpphy;
              where _impute_=1;
             run;

	      proc means data=all noprint;
               var counter; 
               by _impute_;
               output out=count_at_end sum(counter)=count;  
              run;

	      title "counter after everything for &d";run;
	      proc print data=count_at_end;
                var _impute_ count;
              run;

	      data rev4.all&d._post&data;
	        set all;
		run;

/**     title "Correlations for &d reported in Table";run;
     proc corr data=all;
      var ltfp ltfp2 ltfpphy lprice lprice2;
     run;
**/

%mend;


/*** Just getting the PQS and PV imputes for merging with the rest of the data.  ***/

 data bredf77;  set fhs7797.bredf77; year=1977; run;
 data bredf82;  set fhs7797.bredf82; year=1982; run;
 data bredf87;  set fhs7797.bredf87; year=1987; run;
 data bredf92;  set fhs7797.bredf92; year=1992; run;
 data bredf97;  set fhs7797.bredf97; year=1997; run;

 data bredf (keep = ppn year  pqs_imp_mean pv_imp_mean pv); 
  set bredf77 bredf82 bredf87 bredf92 bredf97;
  pv_imp_mean  = 0;
 run;



 data cofff77;  set fhs7797.cofff77; year=1977; run;
 data cofff82;  set fhs7797.cofff82; year=1982; run;
 data cofff87;  set fhs7797.cofff87; year=1987; run;
 data cofff92;  set fhs7797.cofff92; year=1992; run;
 data cofff97;  set fhs7797.cofff97; year=1997; run;

 data cofff (keep = ppn year  pqs_imp_mean pv_imp_mean  pv); 
  set cofff77 cofff82 cofff87 cofff92 cofff97;
 run;

 data icebf77;  set fhs7797.icebf77; year=1977; run;
 data icebf82;  set fhs7797.icebf82; year=1982; run;
 data icebf87;  set fhs7797.icebf87; year=1987; run;
 data icebf92;  set fhs7797.icebf92; year=1992; run;
 data icebf97;  set fhs7797.icebf97; year=1997; run;

 data icebf (keep = ppn year  pqs_imp_mean pv_imp_mean  pv); 
  set icebf77 icebf82 icebf87 icebf92 icebf97;
  pv_imp_mean  = 0;
 run;

 data icepf77;  set fhs7797.icepf77; year=1977; run;
 data icepf82;  set fhs7797.icepf82; year=1982; run;
 data icepf87;  set fhs7797.icepf87; year=1987; run;

 data icepf (keep = ppn year  pqs_imp_mean pv_imp_mean  pv); 
  set icepf77 icepf82 icepf87 ;
  pv_imp_mean  = 0;
 run;

 data flrf77;  set fhs7797.floorf77; year=1977; run;
 data flrf82;  set fhs7797.floorf82; year=1982; run;
 data flrf87;  set fhs7797.floorf87; year=1987; run;
 data flrf92;  set fhs7797.floorf92; year=1992; run;
 data flrf97;  set fhs7797.floorf97; year=1997; run;

 data flrf (keep = ppn year  pqs_imp_mean pv_imp_mean pv); 
  set flrf77 flrf82 flrf87 flrf92 flrf97;
 run;


 data plyf77;  set fhs7797.plyf77; year=1977; run;
 data plyf82;  set fhs7797.plyf82; year=1982; run;
 data plyf87;  set fhs7797.plyf87; year=1987; run;
 data plyf92;  set fhs7797.plyf92; year=1992; run;
 data plyf97;  set fhs7797.plyf97; year=1997; run;

 data plyf (keep = ppn year  pqs_imp_mean pv_imp_mean  pv); 
  set plyf77 plyf82 plyf87 plyf92 plyf97;
 run;

 data boxf77;  set fhs7797.boxf77; year=1977; run;
 data boxf82;  set fhs7797.boxf82; year=1982; run;
 data boxf87;  set fhs7797.boxf87; year=1987; run;

 data boxf (keep = ppn year  pqs_imp_mean pv_imp_mean  pv); 
  set boxf77 boxf82 boxf87;
  pv_imp_mean  = 0;
 run;

 data carbonf77;  set fhs7797.carbonf77; year=1977; run;
 data carbonf82;  set fhs7797.carbonf82; year=1982; run;
 data carbonf87;  set fhs7797.carbonf87; year=1987; run;
 data carbonf92;  set fhs7797.carbonf92; year=1992; run;
 data carbonf97;  set fhs7797.carbonf97; year=1997; run;

 data carbonf (keep = ppn year  pqs_imp_mean pv_imp_mean  pv); 
  set carbonf77 carbonf82 carbonf87 carbonf92 carbonf97;
  pv_imp_mean  = 0;
 run;


 data gasf77;  set fhs7797.gasf77; year=1977; run;
 data gasf82;  set fhs7797.gasf82; year=1982; run;
 data gasf87;  set fhs7797.gasf87; year=1987; run;
 data gasf92;  set fhs7797.gasf92; year=1992; run;
 data gasf97;  set fhs7797.gasf97; year=1997; run;

 data gasf (keep = ppn year  pqs_imp_mean pv_imp_mean  pv); 
  set gasf77 gasf82 gasf87 gasf92 gasf97;
 run;

 data concf77;  set fhs7797.concf77; year=1977; run;
 data concf82;  set fhs7797.concf82; year=1982; run;
 data concf87;  set fhs7797.concf87; year=1987; run;
 data concf92;  set fhs7797.concf92; year=1992; run;

 data concf (keep = ppn year  pqs_imp_mean pv_imp_mean  pv); 
  set concf77 concf82 concf87 concf92;
  pv_imp_mean  = 0;
 run;


%chk(carbon,CART,imputes);
%chk(box,CART,imputes);    
%chk(bred,CART,imputes);
%chk(coff,CART,imputes);
%chk(flr,CART,imputes); 
%chk(gas,CART,imputes); 
%chk(iceb,CART,imputes);
%chk(icep,CART,imputes);
%chk(ply,CART,imputes);
%chk(conc,CART,imputes);    


/** Now create datasets from CART-predicted datasets. */
/**
%chk(bred,pred,predicted);
%chk(carbon,pred,predicted);
%chk(coff,pred,predicted);
%chk(flr,pred,predicted); 
%chk(gas,pred,predicted); 
%chk(iceb,pred,predicted);
%chk(icep,pred,predicted);
%chk(ply,pred,predicted);
%chk(conc,pred,predicted);    
%chk(box,pred,predicted);    
**/
